knitr::opts_chunk$set(
    echo = TRUE,
    message = FALSE,
    warning = FALSE
)

library(tidyverse) #used for data wrangling and viz
library(here) #simplifies file paths
library(rsample) #used to split data
library(janitor) #used to clean data names
library(corrplot) #for correlation viz
library(VIM) #for missing data viz
library(missMDA) #for imputing missing data
library(ggfortify) #for PCA viz
library(fastDummies) #for dummy coding
library(caret) #for cross validation
library(class) #for knn
library(gbm) #for boosted trees modeling
library(randomForest) #for random forest modeling
library(maptools)
library(RColorBrewer)
library(classInt)
library(ggplot2)
library(ggrepel)
library(mapproj)
library(viridis)

#turns scientific notation off
options(scipen = 100)

#some of our workflow includes randomness. here we set a seed so our workflow can be reproduced without worrying about random variation
set.seed(123) 

1 Introduction

The purpose of this project is to use machine learning techniques to predict which wildfires will grow to a catastrophic size. Our most accurate model will then be used to assess how global climate change may influence a wildfires predicted size.

1.1 Background

fire situation in North America, climate change, etc

1.2 Why this model is useful

What will it be used for?

1.3 Data and Packages Used

This dataset is available on Kaggle. It is a subset of larger fires

Important attributes include

  • fire_size - the size of the fire in acres

  • stat_cause_descr - the cause of the fire

  • vegetation - code corresponding to vegetation type

  • temp_cont - temperature (Celsius) when the fire was contained

A complete codebook is available in the project file

#read in dataset
us_wildfire <- read_csv(here("archive", "FW_Veg_Rem_Combined.csv"))
#Following to the National Geographic Education
us_wildfire$region = "NorthEast"
us_wildfire[which(us_wildfire$state %in% c("AK", "WA", "OR", "CA", "NV", "ID", "MT", "WY", "UT", "CO")),"region"] = "West"
us_wildfire[which(us_wildfire$state %in% c("AZ", "NM", "TX", "OK")), "region"] = "SouthWest"
us_wildfire[which(us_wildfire$state %in% c("ND", "SD", "NE", "KS", "MN", "IA", "MO", "WI", "IL", "MI", "IN", "OH")), "region"] = "MidWest"
us_wildfire[which(us_wildfire$state %in% c("AR", "LA", "MS", "AL", "GA", "FL", "SC", "NC", "TN", "KY", "VA", "WV")), "region"] = "SouthEast"

us_wildfire$year_diff = us_wildfire$disc_pre_year - min(us_wildfire$disc_pre_year)

2 Methods

Overview of methods

2.1 Cleaning Data

what we did to clean the data

#First, there are a couple of junk columns in the data, so we select only the columns that mean something, then we use janitor's clean names function for lowercase snake col names
us_wildfire_clean <- us_wildfire %>% 
  dplyr::select(fire_name:year_diff) %>% 
  clean_names()

#we are interested in using weather to predict fire duration, so we filter out observations that do not have a weather file
us_wildfire_clean <- us_wildfire_clean %>% 
  filter(weather_file != "File Not Found")

#Here we label vegetation according to the provided codebook
us_wildfire_clean <- us_wildfire_clean %>% 
  mutate(vegetation_classed = case_when(
    vegetation == 12 ~ "Open Shrubland",
    vegetation == 15 ~ "Polar Desert/Rock/Ice",
    vegetation == 16 ~ "Secondary Tropical Evergreen Broadleaf Forest",
    vegetation == 4 ~ "Temperate Evergreen Needleleaf Forest TmpENF",
    vegetation == 9 ~ "C3 Grassland/Steppe",
    vegetation == 14 ~ "Desert"
  ))

#According the metadata for this data set, the vegetation was created by interpolating most likely vegetation based on latitude and longitude. The most common vegetation type is listed as "Polar Desert/Rock/Ice" and this seems very unlikely.

#There are some weather observations for which every weather field is 0. this seems unlikely. so we will replace them with NA
us_wildfire_clean <- us_wildfire_clean %>%
  mutate_at(vars(temp_pre_30:hum_cont), ~na_if(., 0.0000000))
#precipitation is frequently zero, so we will not replace all zeros with NAs. Instead we will follow the pattern of NAs seen in other weather columns.

#we can see that there is a clear pattern in the missing weather data. when temp_pre_30 is missing, so is wind_pre_30 and humidity_pre_30. We will assume this extends to precipitation
us_wildfire_clean <- us_wildfire_clean %>% 
  mutate(prec_pre_30 = case_when(is.na(temp_pre_30) & is.na(hum_pre_30) & is.na(wind_pre_30) ~ NA_real_,
                                      TRUE ~ prec_pre_30)) %>% 
  mutate(prec_pre_15 = case_when(is.na(temp_pre_15) & is.na(hum_pre_15) & is.na(wind_pre_15) ~ NA_real_,
                                      TRUE ~ prec_pre_15)) %>% 
  mutate(prec_pre_7 = case_when(is.na(temp_pre_7) & is.na(hum_pre_7) & is.na(wind_pre_7) ~ NA_real_,
                                      TRUE ~ prec_pre_7)) %>% 
  mutate(prec_cont = case_when(is.na(temp_cont) & is.na(hum_cont) & is.na(wind_cont) ~ NA_real_,
                                      TRUE ~ prec_cont))

#there are multiple date columns. however, since full dates are mostly missing, we will only keep month and year as variables
us_wildfire_clean <- us_wildfire_clean %>% 
  dplyr::select(-disc_clean_date, 
         -disc_date_pre, 
         -cont_clean_date, 
         -disc_pre_month,
         -disc_date_final,
         -cont_date_final,
         -putout_time)

#we also can reclass months into seasons, to reduce factor levels
us_wildfire_clean <- us_wildfire_clean %>% 
  mutate(season = case_when(discovery_month %in% c("Apr", "May", "Jun") ~ "Spring",
                            discovery_month %in% c("Jul", "Aug", "Sep") ~ "Summer",
                            discovery_month %in% c("Oct", "Nov", "Dec") ~ "Fall",
                            discovery_month %in% c("Jan", "Feb", "Mar") ~ "Winter")) %>% 
  select(-discovery_month)

2.2 Exploratory Analysis

~ a few key graphs, exploring data

#summarise acres per year burned
acres_per_year <- us_wildfire_clean %>% 
  group_by(disc_pre_year) %>% 
  summarise(acres_burned = sum(fire_size))

#fire size (finalized graph)
ggplot(data = acres_per_year) + 
  geom_point(aes(x = disc_pre_year, 
                 y = acres_burned, 
                 size = acres_burned, 
                 color = acres_burned)) +
  scale_color_continuous(high = "firebrick", low = "goldenrod1") +
  labs(x = "Year", y = "Total Acres Burned", 
       title = "Total acres burned per year from 1990 to 2015") +
  theme_minimal() +
  theme(legend.position = "none")

#most common causes of fire
fire_causes <- us_wildfire_clean %>% 
  group_by(stat_cause_descr) %>% 
  count()

#cause (finalized)
ggplot(data = fire_causes, aes(y = reorder(stat_cause_descr, n), x = n)) +
  geom_col(aes(fill = n)) +
  scale_fill_gradient(high = "firebrick", low = "goldenrod1") +
  labs(x = "Number of Fires", 
       y = "Cause",
       tite = "Number of fires per listed starting cause") +
  theme_minimal() +
  theme(legend.position = "none")

2.2.1 Map of regions

us_wildfire_clean$class_fac = factor(us_wildfire_clean$fire_size_class, levels = c("A", "B", "C", "D", "E", "F", "G"))

us <- map_data("world", 'usa')

state <- map_data("state")

state$region2 = "NorthEast"
state[which(state$region %in% c("alaska", "washington", "oregon", "california", "nevada", "idaho", "montana", "utah", "wyoming", "colorado")), "region2"] = "West"
state[which(state$region %in% c("arizona", "new mexico", "oklahoma", "texas")), "region2"] = "SouthWest"
state[which(state$region %in% c("north dakota", "south dakota", "nebraska", "kansas", "minnesota", "iowa", "missouri", "wisconsin", "illinois", "indiana", "michigan", "ohio")), "region2"] = "MidWest"
state[which(state$region %in% c("arkansas", "louisiana", "mississippi", "alabama", "florida", "georgia", "south carolina", "north carolina", "tennessee", "kentucky", "virginia", "west virginia")), "region2"] = "SouthEast"

#state$region = as.factor(state$region)

ggplot(data=state, aes(x=long, y=lat, group = region)) + 
  geom_polygon(aes(fill=region2)) +
  ggtitle("US Region")+
  guides(fill=guide_legend(title="Region"))+
  coord_map(projection = "sinusoidal", xlim=c(-120, -75), ylim = c(25, 50))

2.2.2 Map of fires

2.2.2.1 The spatial distribution of wildifres

ggplot() + 
  geom_polygon(data=state, aes(x=long, y=lat, group=group), color = "white", fill = "grey") + 
  geom_point(data = us_wildfire_clean, aes(x=longitude, y = latitude, color = class_fac)) +
  scale_color_brewer(palette = "YlOrRd")+
  ggtitle("US Wildfire Distribution")+
  guides(color=guide_legend(title="Wild Fire Scale"))+
  coord_map(projection = "sinusoidal", xlim=c(-120, -75), ylim = c(25, 50))

2.2.3 When we divide it to three periods, we can see the wildfire risk has been growing in Western parts of the US.

ggplot() + 
  geom_polygon(data=state, aes(x=long, y=lat, group=group), color = "white", fill = "grey") + 
  geom_point(data = us_wildfire_clean[which(us_wildfire_clean$wstation_byear < 1970),], aes(x=longitude, y = latitude, color = class_fac)) +
  scale_color_brewer(palette = "YlOrRd")+
  ggtitle("US Wildfire Distribution before 1970")+
  guides(color=guide_legend(title="Wild Fire Scale"))+
  coord_map(projection = "sinusoidal", xlim=c(-120, -75), ylim = c(25, 50))

ggplot() + 
  geom_polygon(data=state, aes(x=long, y=lat, group=group), color = "white", fill = "grey") + 
  geom_point(data = us_wildfire_clean[which(us_wildfire_clean$wstation_byear >= 1970 & us_wildfire_clean$wstation_byear < 2000),], aes(x=longitude, y = latitude, color = class_fac)) +
  scale_color_brewer(palette = "YlOrRd")+
  ggtitle("US Wildfire Distribution 1970-2000")+
  guides(color=guide_legend(title="Wild Fire Scale"))+
  coord_map(projection = "sinusoidal", xlim=c(-120, -75), ylim = c(25, 50))

ggplot() + 
  geom_polygon(data=state, aes(x=long, y=lat, group=group), color = "white", fill = "grey") + 
  geom_point(data = us_wildfire_clean[which(us_wildfire_clean$wstation_byear >= 200),], aes(x=longitude, y = latitude, color = class_fac)) +
  scale_color_brewer(palette = "YlOrRd")+
  ggtitle("US Wildfire Distribution after 2000")+
  guides(color=guide_legend(title="Wild Fire Scale"))+
  coord_map(projection = "sinusoidal", xlim=c(-120, -75), ylim = c(25, 50))

2.2.4 Density graph

ggplot() + 
  geom_density(data= us_wildfire_clean[which(us_wildfire_clean$wstation_byear <= 1970 & us_wildfire_clean$fire_size > 100),], aes(x = fire_size, y=..density..),
               alpha=.3,
               colour="dodgerblue", fill="dodgerblue") + 
  geom_density(data= us_wildfire_clean[which(us_wildfire_clean$wstation_byear >= 1970 & us_wildfire_clean$fire_size > 100 & us_wildfire_clean$wstation_byear < 2000),], aes(x = fire_size, y=..density..),
               alpha=.3,
               colour="yellow3", fill="yellow3") + 
  geom_density(data= us_wildfire_clean[which(us_wildfire_clean$wstation_byear >= 2000 & us_wildfire_clean$fire_size > 100),], aes(x = fire_size, y=..density..),
               alpha=.3,
                 colour="firebrick3", fill="firebrick3") + 
  xlim(10000, 100000) + 
  ggtitle("Wildfire Severeity")

3 Exploring Missing Data and Correlation

#missing data plot
aggr_plot <- aggr(us_wildfire_clean, 
                  col = c('navyblue','red'), 
                  numbers = TRUE, 
                  sortVars = TRUE, 
                  labels = names(us_wildfire_clean), 
                  cex.axis = .7, 
                  gap = 2, 
                  ylab = c("Histogram of missing data","Pattern"))

## 
##  Variables sorted by number of missings: 
##            Variable      Count
##           fire_name 0.50282019
##            hum_cont 0.42786638
##           wind_cont 0.41558884
##           temp_cont 0.41002139
##           prec_cont 0.41002139
##  vegetation_classed 0.17480307
##           hum_pre_7 0.14329476
##          hum_pre_15 0.12783234
##          wind_pre_7 0.12250802
##          temp_pre_7 0.11173782
##          prec_pre_7 0.11173782
##         wind_pre_15 0.10663231
##         temp_pre_15 0.09571623
##         prec_pre_15 0.09571623
##          hum_pre_30 0.09413595
##         wind_pre_30 0.07308179
##         temp_pre_30 0.06216571
##         prec_pre_30 0.06216571
##           fire_size 0.00000000
##     fire_size_class 0.00000000
##    stat_cause_descr 0.00000000
##            latitude 0.00000000
##           longitude 0.00000000
##               state 0.00000000
##       disc_pre_year 0.00000000
##       wstation_usaf 0.00000000
##          dstation_m 0.00000000
##       wstation_wban 0.00000000
##      wstation_byear 0.00000000
##      wstation_eyear 0.00000000
##          vegetation 0.00000000
##            fire_mag 0.00000000
##        weather_file 0.00000000
##          remoteness 0.00000000
##              region 0.00000000
##           year_diff 0.00000000
##              season 0.00000000
##           class_fac 0.00000000
#It is likely that weather columns are correlated. to investigate this we create a correlation matrix

#create a dataframe of weather
weather <- us_wildfire_clean %>% 
  dplyr::select(temp_pre_30:prec_cont)

#create a correlation matrix (omitting NAs)
cor_matrix <- cor(weather, use = "complete.obs")

#create a visualization
corrplot(cor_matrix, type = "upper", order = "hclust", 
         tl.col = "black", tl.srt = 45)

#we can see that their is a strong correlation with each set of variables. i.e. temperature 7 days before the fire is correlated with temp 30 days before the fire. 

#consider PCA on wind, PCA on hum, PCA on temp

3.1 Model Preparation

3.1.1 Splitting Data

We split our data into 80% training and 20% testing data. Because fire size is heavily skewed towards smaller fires, we used stratified sampling.

There are xyz observations in the training set and xyz observations in the test set.

#first we make a dataframe containing only variables we want to use in our models
fire_modeling_df <- us_wildfire_clean %>%
  dplyr::select(-fire_name, #remove fire name
         -fire_size_class, #remove fire size class
         -class_fac, #which is also a size class
         -state, #because we have regions
         -disc_pre_year, #because we have year_diff to represent year
         -wstation_usaf, #remove weather station name
         -wstation_wban, #remove this (not described in codebook)
         -wstation_byear, #remove station year installed
         -wstation_eyear, #remove station year ended data recording
         -weather_file, #remove name of weather file
         -dstation_m, #remove distance of weather station to fire
         -vegetation #remove vegetaiton because we already have it classed
         )

#define split parameters
us_wildfire_split <- fire_modeling_df %>% 
  initial_split(prop = 0.8, strata = "fire_size")

#write split data to data frames
fire_train <- training(us_wildfire_split)
fire_test <- testing(us_wildfire_split)

#set up folds in training data for cross validation
train_folds <- vfold_cv(fire_train, v = 10, repeats = 5)

3.1.2 Imputing Missing Data

#select weather cols     
weather_train <- fire_train %>%
  select(temp_pre_30:prec_cont)

weather_test <- fire_test %>%
  select(temp_pre_30:prec_cont)

#select cols not in weather
notweather_train <- fire_train %>%
  select(-colnames(weather_train))

notweather_test <- fire_test %>%
  select(-colnames(weather_test))

#imputation of weather data
weather_train_imputed <- imputePCA(weather_train, ncp=4)


weather_test_imputed <- imputePCA(weather_test, ncp=4)

3.1.3 Principle Components

#Do PCA on both test and train data separately
weather_train_PCA <- weather_train_imputed$completeObs %>%
  scale(center = T, scale = T) %>%  #first scale and center the data
  prcomp(rank. = 4) #do PCA


#weather_train_PCA$x

#Do PCA on both test and train data separately
weather_test_PCA <- weather_test_imputed$completeObs %>%
  scale(center = T, scale = T) %>%  #first scale and center the data
  prcomp(rank. = 4)

#Make a PCA bi-plot
autoplot(weather_train_PCA, 
         data = weather_train,
         loadings = TRUE, 
         loadings.label = TRUE,
         loadings.label.hjust = 1.2) +
  theme_classic() +
  labs(caption = "Principle Component Analysis Bi-plot of Weather Training Data")

#put data back together

#bind imputed weather data to rest of rows
fire_train_complete <- cbind(notweather_train, weather_train_PCA$x) %>% 
  na.omit()

fire_test_complete <- cbind(notweather_test, weather_test_PCA$x) %>% 
  na.omit()

3.1.4 Linear regression for each region

summary(lm(fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete))
## 
## Call:
## lm(formula = fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -11535  -2132   -972    198 527469 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)   900.96     183.42   4.912         0.0000009073 ***
## PC1          -946.90      34.34 -27.570 < 0.0000000000000002 ***
## PC2           -83.98      37.23  -2.256               0.0241 *  
## PC3            91.64      43.42   2.111               0.0348 *  
## PC4          -717.13      52.60 -13.635 < 0.0000000000000002 ***
## year_diff      66.81      12.10   5.522         0.0000000338 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12580 on 27223 degrees of freedom
## Multiple R-squared:  0.03419,    Adjusted R-squared:  0.03402 
## F-statistic: 192.8 on 5 and 27223 DF,  p-value: < 0.00000000000000022
summary(lm(fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete[which(fire_train_complete$region == "SouthWest"),]))
## 
## Call:
## lm(formula = fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete[which(fire_train_complete$region == 
##     "SouthWest"), ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -9453  -2429   -859    516 529317 
## 
## Coefficients:
##             Estimate Std. Error t value          Pr(>|t|)    
## (Intercept)  -497.17     526.14  -0.945           0.34473    
## PC1          -615.87      83.10  -7.411 0.000000000000142 ***
## PC2          -598.26      85.69  -6.981 0.000000000003224 ***
## PC3           -38.79     210.87  -0.184           0.85404    
## PC4          -684.99     163.45  -4.191 0.000028186488753 ***
## year_diff     100.09      30.76   3.254           0.00115 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13990 on 6320 degrees of freedom
## Multiple R-squared:  0.02996,    Adjusted R-squared:  0.02919 
## F-statistic: 39.04 on 5 and 6320 DF,  p-value: < 0.00000000000000022
summary(lm(fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete[which(fire_train_complete$region == "SouthEast"),]))
## 
## Call:
## lm(formula = fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete[which(fire_train_complete$region == 
##     "SouthEast"), ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -2524   -358   -215    -50 308819 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   45.311     90.634   0.500  0.61713    
## PC1          -83.689     31.245  -2.679  0.00740 ** 
## PC2          -32.851     22.389  -1.467  0.14232    
## PC3          -49.264     18.405  -2.677  0.00745 ** 
## PC4          119.955     31.773   3.775  0.00016 ***
## year_diff     18.995      6.097   3.116  0.00184 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4163 on 12414 degrees of freedom
## Multiple R-squared:  0.003414,   Adjusted R-squared:  0.003013 
## F-statistic: 8.505 on 5 and 12414 DF,  p-value: 0.00000004755
summary(lm(fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete[which(fire_train_complete$region == "MidWest"),]))
## 
## Call:
## lm(formula = fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete[which(fire_train_complete$region == 
##     "MidWest"), ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -3260  -1047   -549    -22  81411 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) 1050.411    237.361   4.425            0.0000101 ***
## PC1         -480.255     54.636  -8.790 < 0.0000000000000002 ***
## PC2          -18.515     45.774  -0.404             0.685888    
## PC3           41.233     51.842   0.795             0.426487    
## PC4         -254.899     76.985  -3.311             0.000943 ***
## year_diff      5.654     13.775   0.410             0.681510    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4177 on 2436 degrees of freedom
## Multiple R-squared:  0.033,  Adjusted R-squared:  0.03102 
## F-statistic: 16.63 on 5 and 2436 DF,  p-value: 0.0000000000000003531
summary(lm(fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete[which(fire_train_complete$region == "West"),]))
## 
## Call:
## lm(formula = fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete[which(fire_train_complete$region == 
##     "West"), ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -17945  -7819  -4703  -1010 495921 
## 
## Coefficients:
##             Estimate Std. Error t value        Pr(>|t|)    
## (Intercept)   963.14     906.89   1.062         0.28828    
## PC1         -1335.93     205.11  -6.513 0.0000000000819 ***
## PC2           693.22     234.57   2.955         0.00314 ** 
## PC3           423.89     596.90   0.710         0.47765    
## PC4           -86.46     396.12  -0.218         0.82724    
## year_diff     277.22      56.64   4.895 0.0000010191622 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24940 on 4346 degrees of freedom
## Multiple R-squared:  0.02072,    Adjusted R-squared:  0.01959 
## F-statistic: 18.39 on 5 and 4346 DF,  p-value: < 0.00000000000000022
summary(lm(fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete[which(fire_train_complete$region == "NorthEast"),]))
## 
## Call:
## lm(formula = fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff, data = fire_train_complete[which(fire_train_complete$region == 
##     "NorthEast"), ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -189.0  -69.2  -34.2    1.7 9065.6 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  73.15430   34.58647   2.115 0.034567 *  
## PC1         -35.91001    9.55715  -3.757 0.000178 ***
## PC2           0.87420    6.41421   0.136 0.891608    
## PC3          -9.31642    4.91710  -1.895 0.058303 .  
## PC4          -2.26750    6.23670  -0.364 0.716223    
## year_diff    -0.06306    2.07576  -0.030 0.975767    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 441.2 on 1683 degrees of freedom
## Multiple R-squared:  0.0118, Adjusted R-squared:  0.008864 
## F-statistic: 4.019 on 5 and 1683 DF,  p-value: 0.001252

3.2 Modeling As a Regression Problem

3.2.1 K Nearest Neighbor

KNN (short for K Nearest Neighbor) is a non-linear algorithm that works for both classification and regression problems. The basic premise behind the model is that it predicts the dependent variable based on how similar they are to other observations where dependent variable is known.

KNN works by calculating euclidean distance between observations, so all inputs have to be numeric. Therefore, we have to do some pre-model data changes to our training and test sets. For both test data and training data we dummy code categorical variables, then we center and scale the data. As before, we assume that the training and test data are completely separated, so we process the two data sets separately.

#first we dummy code categorical variables (what about ordinal?)
knn_ready_train <- dummy_cols(fire_train_complete, select_columns = c("stat_cause_descr", "region", "vegetation_classed", "season")) %>% 
  #then we get rid of non-dummy coded variables
  select(-stat_cause_descr, - region, - vegetation_classed, -season) %>% 
  #then convert back to a data frame (as output for `dummy_cols` is a matrix)
  as.data.frame()

#then we center and scale the data, except our outcome
knn_ready_train[,-1] <- scale(knn_ready_train[,-1], center = T, scale = T)


#of course our next step is to do the same to the test data
knn_ready_test <- dummy_cols(fire_test_complete, select_columns = c("stat_cause_descr", "region", "vegetation_classed", "season")) %>% 
  select(-stat_cause_descr, - region, - vegetation_classed, -season) %>% 
  as.data.frame()

knn_ready_test[,-1] <- scale(knn_ready_test[,-1], center = T, scale = T)

Next we split up the training and test data into a data frames that have only independent variables and dependent variables (in this case fire_size). We do this for both the test and the training data.

#YTrain is the true values for fire size on the training set 
YTrain = knn_ready_train$fire_size

#XTrain is the design matrix for training data
XTrain =  knn_ready_train %>% 
  select(-fire_size)
 
#YTest is the true value for fire_size on the test set
YTest = knn_ready_test$fire_size

#Xtest is the design matrix for test data
XTest = knn_ready_test %>% 
  select(-fire_size)

Then we use leave one out cross validation to determine the best number of neighbors to consider. A low number of neighbors considered results in a highly flexible model, while a higher number results in a less flexible model. For this process we built a function which we enter a starting value of K, an ending value of K, and the sampling interval. The result is a data frame of MSE values for each value of K that we test. This process is computationally intensive so we automatically saved results to a csv.

#make a function that saves KNN LOOCV results for different values of K
knn_loocv <- function(startk, endk, interval)
  {
  
#set up possible number of nearest neighbors to be considered 
allK = seq(from = startk, to = endk, by = interval)

#create a vector of the same length to save the MSE in later
k_mse = rep(NA, length(allK))

#for each number in allK, use LOOCV to find a validation error  
for (i in allK){  
  #loop through different number of neighbors
  #predict on the left-out validation set
  pred.Yval = knn.cv(train = XTrain, cl = YTrain, k = i) 
  #find the mse for each value of k
  k_mse[i] = mean((as.numeric(pred.Yval) - YTrain)^2)
}

#save as a data frame and filter out NAs (caused by skipping values of k if interval is larger than 1)
k_mse <- as.data.frame(k_mse) %>% 
  filter(!is.na(k_mse))

#bind with k value
knn_loocv_results <- cbind(as.data.frame(allK), k_mse)

#save MSE as CSV (because the cross validation process takes a long time)
write_csv(knn_loocv_results, paste0("model_results/knn/knn_mse_k", startk,"_k", endk, "_by", interval, ".csv"))

}

#we tried several different sets of k
knn_loocv(startk = 1, endk = 20, interval = 1)
knn_loocv(startk = 1, endk = 500, interval = 20)
knn_loocv(startk = 481, endk = 490, interval = 1)

Next, we go through our results for all the values of K that we tested. We find that the MSE increases as K increases

#read in all the stored k values
knn_mse_k1_k500_by20 <- read_csv(here("model_results", "knn", "knn_mse_k1_k500_by20.csv"))

knn_mse_k1_k20_by1 <- read_csv(here("model_results", "knn", "knn_mse_k1_k20_by1.csv"))

#plot MSE for values of k
plot(knn_mse_k1_k500_by20$allK, knn_mse_k1_k500_by20$k_mse, type = "l", xlab = "k", ylab = "MSE", main = "MSE for K 1 through 500 at intervals of 20")

When we look at K 1-20 we get the lowest MSE at around 3 before it starts increasing.

plot(knn_mse_k1_k20_by1$allK, knn_mse_k1_k20_by1$k_mse, type = "l", xlab = "k", ylab = "MSE", main = "MSE for K 1 through 20 at intervals of 1")

Just to confirm, we will print out the best number of neighbors.

#best number of neighbors
numneighbor = max(knn_mse_k1_k20_by1$allK[knn_mse_k1_k20_by1$k_mse == min(knn_mse_k1_k20_by1$k_mse)])

#print best number of neighbors
numneighbor
## [1] 3

Now that we’ve found the best number of nearest neighbors to consider, we try out KNN on our test data! Below is the test MSE and the test RMSE.

#fit KNN model with best neighbor value to test data
pred.YTest = knn(train = XTrain, test = XTest, cl = YTrain, k = numneighbor)

#predict KNN
test_MSE <- mean((as.numeric(pred.YTest) - YTest)^2)

#print test MSE
test_MSE
## [1] 83563638
#print test RMSE
sqrt(test_MSE)
## [1] 9141.315

Over all, this model performs okay.

3.2.2 Boosted Tree

#train boosted tree model
fire_size_boost = gbm(fire_size~., 
                      data = fire_train,
                      n.trees = 500, 
                      interaction.depth = 4
                    )

#the model summary creates a pretty visualization
summary(fire_size_boost)

#calculate training error

#predict values using training data
predictions_train_boost <- predict(fire_size_boost, data = fire_train)

#calculate rmse for training data
RMSE(predictions_train_boost, fire_train$fire_size)

#calculate test error

#predict values using test data
predictions_test_boost <- predict(fire_size_boost, data = fire_test)

#calculate rmse for training data
RMSE(predictions_test_boost, fire_test$fire_size)

3.2.3 Random Forest

#train model
fire_size_rf = randomForest(fire_size ~ ., #writing the formula
                          data = fire_train_complete, #specifying training data to be used
                          mtry = 9, #setting number of variables to randomly sample per each split
                          ntree= 500, #setting number of trees
                          mode = "regression", #specifying regression
                          na.action = na.omit, #specifying what to do with NAs
                          importance = TRUE #specifying importance of variables should be assessed
                          )
#plot error
plot(fire_size_rf)

#plot variable importance
varImpPlot(fire_size_rf, 
           sort = T, 
           main = "Variable Importance for fire size random forest model", 
           n.var = 5)

#calculate training error

#predict values using training data
predictions_train_rf <- predict(fire_size_rf, data = fire_train)

#calculate rmse for training data
RMSE(predictions_train_rf, fire_train$fire_size)

#calculate test error

#predict values using test data
predictions_test_rf <- predict(fire_size_rf, data = fire_test)

#calculate rmse for training data
RMSE(predictions_test_rf, fire_test$fire_size)

what type of modeling will we do? we must try 4 types

  • neural network

4 Results

predictions without climate change

How do predictions change with 2.5C increase in temp?

5 Conclusion